import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import zscore
# Read the breast-cancer dataset from disk and display it.
df = pd.read_csv('BC.csv')
df
| mean_radius | mean_texture | mean_perimeter | mean_area | mean_smoothness | diagnosis | |
|---|---|---|---|---|---|---|
| 0 | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0 |
| 1 | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0 |
| 2 | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0 |
| 3 | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0 |
| 4 | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0 |
| ... | ... | ... | ... | ... | ... | ... |
| 564 | 21.56 | 22.39 | 142.00 | 1479.0 | 0.11100 | 0 |
| 565 | 20.13 | 28.25 | 131.20 | 1261.0 | 0.09780 | 0 |
| 566 | 16.60 | 28.08 | 108.30 | 858.1 | 0.08455 | 0 |
| 567 | 20.60 | 29.33 | 140.10 | 1265.0 | 0.11780 | 0 |
| 568 | 7.76 | 24.54 | 47.92 | 181.0 | 0.05263 | 1 |
569 rows × 6 columns
df.head()
| mean_radius | mean_texture | mean_perimeter | mean_area | mean_smoothness | diagnosis | |
|---|---|---|---|---|---|---|
| 0 | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0 |
| 1 | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0 |
| 2 | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0 |
| 3 | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0 |
| 4 | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 569 entries, 0 to 568 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 mean_radius 569 non-null float64 1 mean_texture 569 non-null float64 2 mean_perimeter 569 non-null float64 3 mean_area 569 non-null float64 4 mean_smoothness 569 non-null float64 5 diagnosis 569 non-null int64 dtypes: float64(5), int64(1) memory usage: 26.8 KB
df.describe()
| mean_radius | mean_texture | mean_perimeter | mean_area | mean_smoothness | diagnosis | |
|---|---|---|---|---|---|---|
| count | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 |
| mean | 14.127292 | 19.289649 | 91.969033 | 654.889104 | 0.096360 | 0.627417 |
| std | 3.524049 | 4.301036 | 24.298981 | 351.914129 | 0.014064 | 0.483918 |
| min | 6.981000 | 9.710000 | 43.790000 | 143.500000 | 0.052630 | 0.000000 |
| 25% | 11.700000 | 16.170000 | 75.170000 | 420.300000 | 0.086370 | 0.000000 |
| 50% | 13.370000 | 18.840000 | 86.240000 | 551.100000 | 0.095870 | 1.000000 |
| 75% | 15.780000 | 21.800000 | 104.100000 | 782.700000 | 0.105300 | 1.000000 |
| max | 28.110000 | 39.280000 | 188.500000 | 2501.000000 | 0.163400 | 1.000000 |
df.isnull().sum()
mean_radius 0 mean_texture 0 mean_perimeter 0 mean_area 0 mean_smoothness 0 diagnosis 0 dtype: int64
df.tail()
| mean_radius | mean_texture | mean_perimeter | mean_area | mean_smoothness | diagnosis | |
|---|---|---|---|---|---|---|
| 564 | 21.56 | 22.39 | 142.00 | 1479.0 | 0.11100 | 0 |
| 565 | 20.13 | 28.25 | 131.20 | 1261.0 | 0.09780 | 0 |
| 566 | 16.60 | 28.08 | 108.30 | 858.1 | 0.08455 | 0 |
| 567 | 20.60 | 29.33 | 140.10 | 1265.0 | 0.11780 | 0 |
| 568 | 7.76 | 24.54 | 47.92 | 181.0 | 0.05263 | 1 |
sns.pairplot(df,hue='diagnosis')
<seaborn.axisgrid.PairGrid at 0x135689fe850>
df['diagnosis'].value_counts().plot.pie(autopct="%.2f%%")
<AxesSubplot:ylabel='diagnosis'>
# Correlation heatmap of all numeric columns (Pearson).
plt.figure(figsize=(14, 6))
corr = df.corr(method='pearson')
heatmap = sns.heatmap(corr, annot=True, vmax=1, vmin=-1, linewidths=1, linecolor='White')
plt.show()

# Per-column distributions. sns.distplot was deprecated in seaborn 0.11
# and removed in 0.14; histplot(..., kde=True) is the documented replacement.
for i in df.columns:
    sns.histplot(df[i], kde=True)
    plt.show()

# Per-column box plots to eyeball outliers (data passed by keyword — the
# bare positional form is deprecated in newer seaborn releases).
for i in df.columns:
    sns.boxplot(x=df[i])
    plt.show()
import plotly.express as px

# Interactive 3-D view: texture vs radius vs class label, coloured by area.
fig = px.scatter_3d(
    df,
    x='mean_texture',
    y='mean_radius',
    z='diagnosis',
    color='mean_area',
)
fig.show()
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,classification_report
# Feature matrix (every column except the last) and target vector.
X = df.iloc[:, :-1]
y = df['diagnosis']

# Hold out 20% for testing; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=123
)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
(455, 5) (455,) (114, 5) (114,)
# Standardise features so distance-based KNN is not dominated by the
# large-magnitude columns (mean_area ~650 vs mean_smoothness ~0.1).
se = StandardScaler()
# Fit the scaling statistics on the training split only...
X_train = se.fit_transform(X_train)
# ...and reuse those statistics on the test split. The original called
# fit_transform here too, which re-fits on the test data — that leaks
# test-set statistics and scales train/test inconsistently.
X_test = se.transform(X_test)

# Baseline classifier: KNN with the default k=5 neighbours.
kn = KNeighborsClassifier()
kn.fit(X_train, y_train)
KNeighborsClassifier()
# KNN predictions on both splits.
y_train_pred=kn.predict(X_train)
y_test_pred=kn.predict(X_test)
# Train vs test accuracy — a large gap would indicate over-fitting.
print(accuracy_score(y_train,y_train_pred))
print(accuracy_score(y_test,y_test_pred))
0.9318681318681319 0.9122807017543859
from sklearn.linear_model import LogisticRegression
# Second model: logistic regression with default settings, trained on
# the already-standardised features.
lr=LogisticRegression()
lr.fit(X_train,y_train)
LogisticRegression()
# Logistic-regression predictions on both splits (overwrites the KNN
# prediction variables above).
y_train_pred=lr.predict(X_train)
y_test_pred=lr.predict(X_test)
# Train vs test accuracy.
print(accuracy_score(y_train,y_train_pred))
print(accuracy_score(y_test,y_test_pred))
0.9296703296703297 0.9385964912280702
# Per-class precision/recall/F1 for logistic regression, train then test.
print(classification_report(y_train,y_train_pred))
print(classification_report(y_test,y_test_pred))
precision recall f1-score support
0 0.93 0.88 0.90 171
1 0.93 0.96 0.94 284
accuracy 0.93 455
macro avg 0.93 0.92 0.92 455
weighted avg 0.93 0.93 0.93 455
precision recall f1-score support
0 0.90 0.93 0.92 41
1 0.96 0.95 0.95 73
accuracy 0.94 114
macro avg 0.93 0.94 0.93 114
weighted avg 0.94 0.94 0.94 114
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
# Third model: an unconstrained decision tree (the report below shows a
# perfect 1.00 train score — it over-fits).
dt=DecisionTreeClassifier()
dt.fit(X_train,y_train)
DecisionTreeClassifier()
# Decision-tree predictions and per-class reports, train then test.
y_train_pred=dt.predict(X_train)
y_test_pred=dt.predict(X_test)
print(classification_report(y_train,y_train_pred))
print(classification_report(y_test,y_test_pred))
precision recall f1-score support
0 1.00 1.00 1.00 171
1 1.00 1.00 1.00 284
accuracy 1.00 455
macro avg 1.00 1.00 1.00 455
weighted avg 1.00 1.00 1.00 455
precision recall f1-score support
0 0.84 0.90 0.87 41
1 0.94 0.90 0.92 73
accuracy 0.90 114
macro avg 0.89 0.90 0.90 114
weighted avg 0.91 0.90 0.90 114
# Depth-limited tree to curb the over-fitting seen above.
dt1=DecisionTreeClassifier(max_depth=6)
dt1.fit(X_train,y_train)
DecisionTreeClassifier(max_depth=6)
# Pruned-tree predictions and per-class reports, train then test.
y_train_pred=dt1.predict(X_train)
y_test_pred=dt1.predict(X_test)
print(classification_report(y_train,y_train_pred))
print(classification_report(y_test,y_test_pred))
precision recall f1-score support
0 0.98 0.99 0.99 171
1 1.00 0.99 0.99 284
accuracy 0.99 455
macro avg 0.99 0.99 0.99 455
weighted avg 0.99 0.99 0.99 455
precision recall f1-score support
0 0.84 0.90 0.87 41
1 0.94 0.90 0.92 73
accuracy 0.90 114
macro avg 0.89 0.90 0.90 114
weighted avg 0.91 0.90 0.90 114
# Render the depth-limited tree; labels come from the original
# (unscaled) feature columns, nodes coloured by majority class.
fig,ax=plt.subplots(figsize=(10,10))
chart=plot_tree(dt1,max_depth=6,feature_names=X.columns,filled=True,fontsize=10)